** Session setup
* Close a log left open by a previous run (capture: no error when none is open)
capture log close
* Start from an empty session
clear all
* Never pause output; the setting is stored permanently
set more off, permanently
* Fix the random-number seed
set seed 1234

**********************************************************************************************************************************************************
*************************************************************** UNICAMP dataset **************************************************************************
**********************************************************************************************************************************************************

**********************************************************************************************************************************************************
*************************************************************** Paper's main dataset *********************************************************************	
**********************************************************************************************************************************************************

/*
The data used in our main analysis have one observation per student and Phase 2 subject (8 lines per student).
Phase 1 and Phase 2 scores are normalized by subject and year to z-scores (mean 0, standard deviation 1).
Trainees are excluded from the sample.
In addition, since Phase 2 scores exist only for applicants who survived Phase 1, we exclude students who did not pass to Phase 2.
*/

***********************************************************************************************************************************************

** Build the base file: socioeconomic variables + Phase 1 scores + Phase 2 scores

* Socioeconomic file is the master; its applicant id is renamed to the merge key
use "Work data/work_data_SE.dta", clear
rename studentno inscri2

* Attach Phase 1 scores (merge indicator kept in mergep1)
merge 1:1 inscri2 using "Work Data/work_data_p1.dta", generate(mergep1)
sort inscri2

*** Compatible essay scores
* The 2000 essay score is multiplied by 2.4; the summaries before and after show the effect by year
bysort year: summarize essay_st1
replace essay_st1 = essay_st1*2.4 if year == 2000
bysort year: summarize essay_st1
* A missing essay score for an applicant with attn_st1==1 is set to zero
replace essay_st1 = 0 if essay_st1 == . & attn_st1 == 1

* Phase 1 total score per subject (sum of the two questions of each subject)
describe *1_st1 *2_st1
foreach subj in biol chem hist phy geog math {
    egen tot_`subj'_p1 = rowtotal(`subj'1_st1 `subj'2_st1)
    * rowtotal() treats missing as zero, so blank the total when either question is missing
    replace tot_`subj'_p1 = . if `subj'1_st1 == . | `subj'2_st1 == .
    summarize `subj'1_st1 `subj'2_st1 tot_`subj'_p1
}

* Attach Phase 2 scores (merge indicator kept in mergep2)
merge 1:1 inscri2 using "Work Data/work_data_p2.dta", generate(mergep2)

/* Eliminate one applicant with an unknown career choice */
drop if career_choice == 1

/*	There were many career variables and they had small differences between each other. After consulting COMVEST, the correct variable should be curso and then
	career_st2 to reflect some last minute changes. The change to curso was already done.*/
replace career_choice = career_st2 if career_st2 != .

* Recode the value 0 to missing in the career variables
foreach v of varlist career_choice curso2 curso3 {
    replace `v' = . if `v' == 0
}

/* 	Because of these career rearrangements, some of the groups may be wrong. This is only relevant for 2001-2003.
	Each line below assigns the group code to the listed career codes for years before 2004. */
replace group = 141 if year < 2004 & inlist(career_choice, 2, 8, 10, 12, 13, 29, 34, 40, 42, 43, 49, 51)
replace group = 143 if year < 2004 & inlist(career_choice, 11, 41)
replace group = 144 if year < 2004 & inlist(career_choice, 9, 39)
replace group = 145 if year < 2004 & inlist(career_choice, 62, 36, 37, 60)
replace group = 146 if year < 2004 & career_choice == 5
replace group = 147 if year < 2004 & career_choice == 50
replace group = 148 if year < 2004 & inlist(career_choice, 52, 55)
replace group = 149 if year < 2004 & career_choice == 48
replace group = 151 if year < 2004 & inlist(career_choice, 7, 16, 18, 19, 20, 38, 44, 57)
replace group = 152 if year < 2004 & career_choice == 30
replace group = 153 if year < 2004 & inlist(career_choice, 17, 47)
replace group = 154 if year < 2004 & career_choice == 56
replace group = 161 if year < 2004 & career_choice == 90
replace group = 162 if year < 2004 & career_choice == 91
replace group = 163 if year < 2004 & career_choice == 92
replace group = 166 if year < 2004 & career_choice == 23
replace group = 167 if year < 2004 & career_choice == 25
replace group = 168 if year < 2004 & career_choice == 26
replace group = 169 if year < 2004 & career_choice == 93
replace group = 171 if year < 2004 & career_choice == 6
replace group = 172 if year < 2004 & career_choice == 14
replace group = 173 if year < 2004 & inlist(career_choice, 15, 75)
replace group = 174 if year < 2004 & inlist(career_choice, 21, 81)
replace group = 175 if year < 2004 & inlist(career_choice, 27, 45)
replace group = 176 if year < 2004 & career_choice == 58
replace group = 177 if year < 2004 & career_choice == 46

/*	The music majors code has to be organized as they used the code 22 and were distinguished by the group.
	The same recoding is applied to the chosen career and to the enrolled career. */
foreach cvar of varlist career_choice career_enroll {
    * Code 22 is split according to the group
    replace `cvar' = 90 if `cvar' == 22 & group == 161
    replace `cvar' = 91 if `cvar' == 22 & group == 162
    replace `cvar' = 92 if `cvar' == 22 & group == 163
    replace `cvar' = 93 if `cvar' == 22 & group == 169
    * Remaining alternative codes are mapped onto the same set of codes
    replace `cvar' = 92 if `cvar' == 71
    replace `cvar' = 93 if `cvar' == 72
    * Code 70 maps to 100 in 2004 and to 90 in every other year
    replace `cvar' = 100 if `cvar' == 70 & year == 2004
    replace `cvar' = 90 if `cvar' == 70 & year != 2004
    replace `cvar' = 91 if `cvar' == 88
}

**	Generate a variable indicating if the major choice needs aptitude test
count
summarize career_choice
* req_apt is 1 for the career codes listed below and 0 for every other value
generate req_apt = inlist(career_choice, 14, 23, 25, 26, 48, 90, 91, 92, 87, 93, 100)
tabulate req_apt

***********************************************************************************************************************************************

*****************************************************************************						
* PHASE 1	- Data cleansing and creating new variables																
*****************************************************************************

* For 2000, the ENEM score is stored in enem2; copy it into enem
replace enem = enem2 if year == 2000
tabulate year, summarize(enem)
* Compare enem with enem_score before discarding the latter
compare enem enem_score
drop enem_score

* Dummy for applicants eliminated in Phase 1:
* zero or missing general-questions score, zero or missing essay, essay_corr equal to 0, or attn_st1 equal to 0
generate ELIM_1 = (gen_ques_st1 == 0 | gen_ques_st1 == . | essay_st1 == 0 | essay_st1 == . | essay_corr == 0 | attn_st1 == 0)
tabulate ELIM_1

*** Sample restrictions

* Drop all applicants that were eliminated from Phase 1
drop if ELIM_1 == 1

* The final score only exists for applicants who passed Phase 1, so everyone who did not pass is dropped
drop if pass_st1 == 0

* Creating variable P1 total scores

/* 	We use pass_st1 from their database, but calculate total_st1.
	Since the variable total_st1 is calculated in different ways for different years, we cannot use it for predicting who passed Phase 1.
	We rather create a variable NP that is based on the announced criteria.
*/

* P1 score without ENEM (kept for descriptive statistics)
generate double NP1_noENEM = gen_ques_st1 + essay_st1

* NP starts as the same sum; it may be replaced by the ENEM-weighted score below
generate double NP = NP1_noENEM

* ENEM-weighted alternative: 80% NP + 20% ENEM (in 2008 the ENEM term is additionally multiplied by 0.96)
generate double NENEM = cond(year == 2008, 0.8*NP+0.2*0.96*enem, 0.8*NP+0.2*enem)

* How many pre-affirmative-action applicants after 2000 benefit from the ENEM-weighted score
count if (NENEM > NP & NENEM != .) & aa_year == 0 & year > 2000
count if aa_year == 0

* The applicant keeps the higher of the two scores
replace NP = NENEM if NENEM > NP & NENEM != .
summarize NENEM NP

* Creating standardized P1 scores

/*	Standardize score Phase 1 among those who attended the exam and passed Phase 1.
	Note that the standardization was done within areas until 2003.
	After 2003, it was done within all students who passed Phase 1.
	The standardized score is 500 + 100 * (NP - mean) / sd.
*/

generate double NP_st1 = .

* 2000-2003 (year 2000 included): mean and sd computed within year and area
forvalues y = 0/3 {
    forvalues z = 1/4 {
        summarize NP if year == 200`y' & area_st2 == `z'
        scalar M_st1_`z'_`y' = r(mean)
        scalar D_st1_`z'_`y' = r(sd)
        replace NP_st1 = (((NP - M_st1_`z'_`y' )*100)/D_st1_`z'_`y')+500 if year == 200`y' & area_st2 == `z'
    }
}

* 2004-2008: mean and sd computed within year only
forvalues y = 4/8 {
    summarize NP if year == 200`y'
    scalar M_st1_`y' = r(mean)
    scalar D_st1_`y' = r(sd)
    replace NP_st1 = (((NP - M_st1_`y' )*100)/D_st1_`y')+500 if year == 200`y'
}

* Check the result by year and count applicants left without a standardized score
bysort year: summarize NP_st1
count if NP_st1 == .

***********************************************************************************************************************************************

*****************************************************************************																		
* PHASE 2 AND APTITUDE TEST	- Data cleansing and creating new variables														
*****************************************************************************

* Some applicants to careers without an aptitude test are recorded as absent from that test;
* for those careers the attendance flag is set to 1
tabulate aptd_attn_st2 req_apt
replace aptd_attn_st2 = 1 if req_apt == 0

/* 	
We have grades in the aptitude test for students who did not need to take the aptitude test. After verifying, we can see that these are students who
initially chose majors with an aptitude test, but then were moved to another major. 
There is no need to change these observations, but one must be careful that only the grade of those applying for a major requiring an aptitude test have to be taken into account.
*/

/*	
The applicant fails if they did not attend exam or received zero in either part of the exam. 
Until 2006, the students needed at least 30 points in the aptitude test. Starting in 2007, they just need a grade larger than 0 in the aptitude test in order not to be eliminated.
I consider that the standardization is done within all students who attend a given exam.
If the student is not present in one exam, but is present in another, her grade is considered for the exam she was present.
*/

** Creating a variable that indicates whether the student was eliminated from Phase 2. 
* This happens if they did not attend an exam or got zero (or less than 30 in the aptitude test until 2006).

generate ELIM_2 = 0

* Regular subjects: eliminated when absent (attendance code 2) or when the subject total is zero
foreach subj in port biol chem hist phy geog math lang {
    replace ELIM_2 = 1 if `subj'_attn_st2 == 2
    replace ELIM_2 = 1 if `subj'_tot_st2 == 0
}

* Aptitude test, only for careers that require it: absent or zero score
replace ELIM_2 = 1 if req_apt == 1 & aptd_attn_st2 == 2
replace ELIM_2 = 1 if req_apt == 1 & aptd_st2 == 0

* Until 2006, a score below 30 in the aptitude test (50% of the test) also eliminates the applicant
bysort year: summarize aptd_st2 if req_apt == 1
tabulate year if aptd_st2 < 30 & req_apt == 1
replace ELIM_2 = 1 if req_apt == 1 & year <= 2006 & aptd_st2 < 30

tabulate ELIM_2

** Generating the standardized scores
* Every standardized score is 500 + 100 * (raw - mean) / sd, with mean and sd taken among applicants who attended the exam

* Regular disciplines

foreach subj in port biol chem hist phy geog math lang {
    generate `subj'_NP_st2 = .
}

* 2000-2003 (year 2000 included): standardization within year and area
forvalues y = 0/3 {
    foreach subj in port biol chem hist phy geog math lang {
        forvalues z = 1/4 {
            summarize `subj'_tot_st2 if (year == 200`y' & area_st2 == `z' & `subj'_attn_st2 == 1)
            scalar M_`subj'_`z'_`y' = r(mean)
            scalar D_`subj'_`z'_`y' = r(sd)
            replace `subj'_NP_st2 = (((`subj'_tot_st2 - M_`subj'_`z'_`y' )*100)/D_`subj'_`z'_`y')+500 if area_st2 == `z'& year == 200`y'
        }
    }
}

* 2004-2008: standardization within year
forvalues y = 4/8 {
    foreach subj in port biol chem hist phy geog math lang {
        summarize `subj'_tot_st2 if (year == 200`y' & `subj'_attn_st2 == 1)
        scalar M_`subj'_`y' = r(mean)
        scalar D_`subj'_`y' = r(sd)
        replace `subj'_NP_st2 = (((`subj'_tot_st2 - M_`subj'_`y' )*100)/D_`subj'_`y')+500 if year == 200`y'
    }
}

* Standardized score for the aptitude exam: within year and career, for each career code that requires the test
generate apt_NP = .
forvalues y = 0/8 {
    foreach career in 14 23 25 26 48 87 90 91 92 93 100 {
        summarize aptd_st2 if (career_choice == `career' & year == 200`y' & aptd_attn_st2 == 1)
        scalar Mapt_`career'_`y' = r(mean)
        scalar Dapt_`career'_`y' = r(sd)
        replace apt_NP = (((aptd_st2 - Mapt_`career'_`y' )*100)/Dapt_`career'_`y' )+500 if year == 200`y' & career_choice == `career'
    }
}
count if req_apt == 1
summarize apt_NP
bysort year: summarize apt_NP

* Number of missing values (omitted answers) in a subject: total and by block of four questions.
* The counts are blanked for years after 2002.
foreach subj in port biol chem hist phy geog math lang {
    egen totalmissing`subj' = rowtotal(missing_p2_`subj'*)
    replace totalmissing`subj' = . if year > 2002
    egen missing1to4`subj' = rowtotal(missing_p2_`subj'1 missing_p2_`subj'2 missing_p2_`subj'3 missing_p2_`subj'4)
    egen missing5to8`subj' = rowtotal(missing_p2_`subj'5 missing_p2_`subj'6 missing_p2_`subj'7 missing_p2_`subj'8)
    egen missing9to12`subj' = rowtotal(missing_p2_`subj'9 missing_p2_`subj'10 missing_p2_`subj'11 missing_p2_`subj'12)
    foreach blk in 1to4 5to8 9to12 {
        replace missing`blk'`subj' = . if year > 2002
    }
    summarize totalmissing`subj' missing1to4`subj' missing5to8`subj' missing9to12`subj'
}

* Zero values that are not missing in Phase 2 questions:
* for applicants who attended the subject exam, flag each question scored 0 that is not marked as missing
foreach subj in port biol chem hist phy geog math lang {
    forvalues q = 1/12 {
        generate zero_p2_`subj'`q' = 0 if `subj'_attn_st2 == 1
        replace zero_p2_`subj'`q' = 1 if `subj'`q'_st2 == 0 & missing_p2_`subj'`q' == 0 & `subj'_attn_st2 == 1
        tabulate year, summarize(zero_p2_`subj'`q')
        * Missing values only in years from 2000 to 2002, so the flag is blanked afterwards
        replace zero_p2_`subj'`q' = . if year > 2002
        tabulate year, summarize(zero_p2_`subj'`q')
    }
}

* Number of non-missing zeros in a subject: total and by block of four questions (blanked after 2002)
foreach subj in port biol chem hist phy geog math lang {
    summarize zero_p2_`subj'*
    egen `subj'_zerostotal_st2 = rowtotal(zero_p2_`subj'*)
    replace `subj'_zerostotal_st2 = . if year > 2002
    egen `subj'_zeros1to4_st2 = rowtotal(zero_p2_`subj'1 zero_p2_`subj'2 zero_p2_`subj'3 zero_p2_`subj'4)
    egen `subj'_zeros5to8_st2 = rowtotal(zero_p2_`subj'5 zero_p2_`subj'6 zero_p2_`subj'7 zero_p2_`subj'8)
    egen `subj'_zeros9to12_st2 = rowtotal(zero_p2_`subj'9 zero_p2_`subj'10 zero_p2_`subj'11 zero_p2_`subj'12)
    foreach blk in 1to4 5to8 9to12 {
        replace `subj'_zeros`blk'_st2 = . if year > 2002
    }
    summarize `subj'_zerostotal_st2 `subj'_zeros1to4_st2 `subj'_zeros5to8_st2 `subj'_zeros9to12_st2
}

* At least one zero in a subject (1 = some zero, 0 = none, missing when the zero count is missing)
foreach subj in port biol chem hist phy geog math lang {
    generate any_zero`subj' = 1 if `subj'_zerostotal_st2 > 0 & `subj'_zerostotal_st2 != .
    replace any_zero`subj' = 0 if `subj'_zerostotal_st2 == 0
}

* Total zeros + total missing - Creating variables
* Counts the questions whose recorded score equals 0, in total and by block of four questions
foreach subj in port biol chem hist phy geog math lang {
    * Varlist with the 12 question scores of the subject
    local items
    forvalues q = 1/12 {
        local items `items' `subj'`q'_st2
    }
    egen total_zeros_missing`subj' = anycount(`items'), values(0)
    egen total_zeros_missing_1to4`subj' = anycount(`subj'1_st2 `subj'2_st2 `subj'3_st2 `subj'4_st2), values(0)
    egen total_zeros_missing_5to8`subj' = anycount(`subj'5_st2 `subj'6_st2 `subj'7_st2 `subj'8_st2), values(0)
    egen total_zeros_missing_9to12`subj' = anycount(`subj'9_st2 `subj'10_st2 `subj'11_st2 `subj'12_st2), values(0)
    summarize total_zeros_missing`subj' total_zeros_missing_1to4`subj' total_zeros_missing_5to8`subj' total_zeros_missing_9to12`subj'
    * The counts are blanked when the subject total is missing
    foreach stub in total_zeros_missing total_zeros_missing_1to4 total_zeros_missing_5to8 total_zeros_missing_9to12 {
        replace `stub'`subj' = . if `subj'_tot_st2 == .
    }
    summarize total_zeros_missing`subj' `subj'_zerostotal_st2 totalmissing`subj' if `subj'_attn_st2 == 1 & year <= 2002
}

* Standard deviation in a subject (std dev from scores in all 12 items)
foreach subj in port biol chem hist phy geog math lang {
    local items
    forvalues q = 1/12 {
        local items `items' `subj'`q'_st2
    }
    egen `subj'_stand_dev_st2 = rowsd(`items')
    replace `subj'_stand_dev_st2 = . if `subj'_tot_st2 == .
    summarize `subj'_stand_dev_st2
}

* Average score in a subject (mean of the 12 items)
foreach subj in port biol chem hist phy geog math lang {
    local items
    forvalues q = 1/12 {
        local items `items' `subj'`q'_st2
    }
    egen `subj'_mean_st2 = rowmean(`items')
    replace `subj'_mean_st2 = . if `subj'_tot_st2 == .
    summarize `subj'_mean_st2
}

* Coefficient of variation in a subject (standard deviation divided by the mean)
foreach subj in port biol chem hist phy geog math lang {
    generate coef_var`subj' = `subj'_stand_dev_st2/`subj'_mean_st2
    summarize coef_var`subj'
}

* Number of "perfect scores" each student scored in a subject (total and by block of four questions)
foreach subj in port biol chem hist phy geog math lang {
    * Varlists with the question scores: all 12 and each block of four
    local all
    forvalues q = 1/12 {
        local all `all' `subj'`q'_st2
    }
    local first  `subj'1_st2 `subj'2_st2 `subj'3_st2 `subj'4_st2
    local second `subj'5_st2 `subj'6_st2 `subj'7_st2 `subj'8_st2
    local third  `subj'9_st2 `subj'10_st2 `subj'11_st2 `subj'12_st2

    bysort year: summarize `all'

    * Years 2000 to 2007 - max score = 5
    egen `subj'_maxtotal_st2 = anycount(`all') if year <= 2007, values(5)
    egen `subj'_max1to4_st2 = anycount(`first') if year <= 2007, values(5)
    egen `subj'_max5to8_st2 = anycount(`second') if year <= 2007, values(5)
    egen `subj'_max9to12_st2 = anycount(`third') if year <= 2007, values(5)

    * Year 2008 - max score = 4 (temporary variables, copied into the main ones and dropped)
    egen `subj'_maxtotal_st2_2008 = anycount(`all') if year == 2008, values(4)
    egen `subj'_max1to4_st2_2008 = anycount(`first') if year == 2008, values(4)
    egen `subj'_max5to8_st2_2008 = anycount(`second') if year == 2008, values(4)
    egen `subj'_max9to12_st2_2008 = anycount(`third') if year == 2008, values(4)

    foreach blk in maxtotal max1to4 max5to8 max9to12 {
        replace `subj'_`blk'_st2 = `subj'_`blk'_st2_2008 if year == 2008
    }

    drop *_st2_2008

    * The counts are blanked when the subject total is missing
    foreach v of varlist `subj'_max* {
        replace `v' = . if `subj'_tot_st2 == .
    }
    summarize `subj'_tot_st2 `subj'_max*

}

* At least one perfect score in a subject (1 = some, 0 = none, missing when the count is missing)
foreach subj in port biol chem hist phy geog math lang {
    generate any_maxscore`subj' = 1 if `subj'_maxtotal_st2 > 0 & `subj'_maxtotal_st2 != .
    replace any_maxscore`subj' = 0 if `subj'_maxtotal_st2 == 0
}

***********************************************************************************************************************************************

*****************************************************************************																		
* FINAL SCORE: FIRST CHOICE ONLY - Creating final score variables									
*****************************************************************************

***	Calculate wNPO, wNPO_aa and wNPOwo (final score variables)

**	Identification of priority discipline
* Attach the priority-discipline dummies (prior_*) of the chosen career
merge m:1 career_choice using "Original data/priority_discipline.dta", generate(mergepriority)
drop career1 career2 career3

** Adjust for changes in the priority disciplines for certain careers

* Philosophy: in 2000 and 2001, Portuguese and Mathematics were priority disciplines. Starting in 2002, only Portuguese was a priority discipline.
tab1 prior_math prior_port if career_choice == 30
replace prior_math = 1 if inlist(year, 2000, 2001) & career_choice == 30

* Dentistry: in 2000 and 2001, it required Biology and Chemistry as priority disciplines. Starting in 2002, only Biology was required.
tab1 prior_bio prior_chem if career_choice == 14
replace prior_chem = 1 if inlist(year, 2000, 2001) & career_choice == 14

* Physical Education: Until 2002, only Biology was priority discipline. Starting in 2003, Biology and History.
tab1 prior_bio prior_hist if career_choice == 27 | career_choice == 45
replace prior_hist = 0 if inlist(year, 2000, 2001, 2002) & career_choice == 27
replace prior_hist = 0 if inlist(year, 2000, 2001, 2002) & career_choice == 45

* Arts:	in 2000, only Portuguese was priority discipline
tab1 prior_hist prior_port if career_choice == 25
replace prior_hist = 0 if year == 2000 & career_choice == 25

** First, calculate the sum of the grades of priority disciplines and then the number of priority disciplines required in that career choice.

* Sum of the standardized scores of the subjects flagged as priority
generate pri_NP_st2 = port_NP_st2*prior_port + biol_NP_st2*prior_biol + chem_NP_st2*prior_chem ///
    + hist_NP_st2*prior_hist + phy_NP_st2*prior_phy + geog_NP_st2*prior_geog ///
    + math_NP_st2*prior_math + lang_NP_st2*prior_lang

* Number of priority subjects
generate num_pri_NP_st2 = prior_port + prior_biol + prior_chem + prior_hist ///
    + prior_phy + prior_geog + prior_math + prior_lang

/*	Second, calculate the sum of the grades of non-priority disciplines and then the number of non-priority disciplines required in that career choice,
	which is, by definition, 8 minus the number of priority disciplines.
*/

generate nonpri_NP_st2 = port_NP_st2*(1-prior_port) + biol_NP_st2*(1-prior_biol) + chem_NP_st2*(1-prior_chem) ///
    + hist_NP_st2*(1-prior_hist) + phy_NP_st2*(1-prior_phy) + geog_NP_st2*(1-prior_geog) ///
    + math_NP_st2*(1-prior_math) + lang_NP_st2*(1-prior_lang)

generate num_nonpri_NP_st2 = 8 - num_pri_NP_st2

*	Third, calculate the weighted aptitude test normalized score. The weighted aptitude test is weighted by 2 except for dentistry.

generate wapt_NP = 2* apt_NP if req_apt == 1
* Dentistry (career 14): weight 1
replace wapt_NP = apt_NP if career_choice == 14
* Careers without aptitude test contribute nothing
replace wapt_NP = 0 if req_apt == 0

*	Fourth, calculate the final score NPO.
* NPO is the weighted sum of standardized scores: Phase 1 (weight 2), priority subjects (weight 2),
* non-priority subjects (weight 1) and, for careers requiring it, the weighted aptitude score.

gen NPO = .
replace NPO = 2*NP_st1 + 2*pri_NP_st2 +nonpri_NP_st2
replace NPO = 2*NP_st1 + 2*pri_NP_st2 +nonpri_NP_st2 +wapt_NP if req_apt == 1

* wNPO divides NPO by the sum of the weights: 2 for Phase 1, 2 per priority subject, 1 per non-priority subject,
* plus the aptitude weight (2 in general, 1 for dentistry, career 14).
gen wNPO = NPO/(2+2*num_pri_NP_st2+num_nonpri_NP_st2)
replace wNPO = NPO/(2+2*num_pri_NP_st2+num_nonpri_NP_st2+2) if req_apt == 1
replace wNPO = NPO/(2+2*num_pri_NP_st2+num_nonpri_NP_st2+1) if career_choice == 14

* Fifth, generate the variable with NPO also taking into account the affirmative action policy.
* 30 points are added when aa_policy==1 and 40 points when aa_policy==2.

gen wNPO_aa = wNPO
replace wNPO_aa = wNPO+30 if aa_policy==1
replace wNPO_aa = wNPO+40 if aa_policy==2

* Sixth, generate the variable with NPO that does not include Phase 1 score nor affirmative action policy.
* NPOwo removes the Phase 1 term from NPO, and the denominators drop the Phase 1 weight accordingly.
* FIX: the two aptitude-career lines previously divided NPO (which still contains 2*NP_st1) instead of NPOwo,
* so wNPOwo for those careers still included the Phase 1 score. They now use NPOwo like the general case.

gen NPOwo=NPO-2*NP_st1
gen wNPOwo=NPOwo/(2*num_pri_NP_st2+num_nonpri_NP_st2)
replace wNPOwo = NPOwo/(2*num_pri_NP_st2+num_nonpri_NP_st2+2) if req_apt == 1
replace wNPOwo = NPOwo/(2*num_pri_NP_st2+num_nonpri_NP_st2+1) if career_choice == 14

***********************************************************************************************************************************************

*****************************************************************************
* Sample restrictions							
*****************************************************************************

* Dropping careers with aptitude test (except for Dentistry, career 14)
drop if req_apt == 1 & career_choice != 14
tabulate year if aa_year == 0

* Eliminating students that did not attend a Phase 2 exam (attendance code 2 in any subject,
* or in the aptitude test when the career requires it)
* 8,911  observations deleted (7.1 percent of the 2000-2008 sample)
drop if port_attn_st2 == 2 | biol_attn_st2 == 2 | chem_attn_st2 == 2 | hist_attn_st2 == 2 ///
    | phy_attn_st2 == 2 | geog_attn_st2 == 2 | math_attn_st2 == 2 | lang_attn_st2 == 2 ///
    | (aptd_attn_st2 == 2 & req_apt == 1)
tabulate year if aa_year == 0 // 4,576 observations deleted (6.6 percent of the 2000-2004 sample)

* This last restriction eliminates all applicants without a final score.
* There is no need, as we eliminated students that did not attend an exam
count if wNPOwo == .

* Dropping Trainees
tabulate year, summarize(trainingnew)
keep if trainingnew == 0

* Restrictions based on missing observations

tabulate sex, missing
drop if sex == .

* Flag applicants with any missing priority-discipline dummy and drop them
generate missing_priority = 0
foreach pvar of varlist prior_* {
    replace missing_priority = 1 if `pvar' == .
}
tabulate missing_priority mergepriority
drop if missing_priority == 1 // 0 obs deleted

* Restriction to concentrate on applicants with 'normal' age to apply (16 to 27; missing age is kept)
summarize agejun agedec if agejun < 16
summarize agejun agedec if agejun > 27
drop if (agejun < 16 | agejun > 27) & agejun != .

** Creating dummy variable for students with missing ENEM scores
* An ENEM score of zero is treated as missing
replace enem = . if enem == 0
generate missing_enem = (enem == .)
summarize missing_enem
* Drop Missing ENEM
drop if missing_enem == 1

** Drop students with zero priority subjects in our main regressions
* Only these six subjects are counted here (Portuguese and foreign language are not in the list)
egen n_prior_reg = rowtotal(prior_biol prior_chem prior_hist prior_phy prior_geog prior_math)
drop if n_prior_reg == 0
drop n_prior_reg

* Only years before the affirmative action took place, excluding 2000
drop if aa_year == 1
tabulate year
drop if year == 2000
tabulate year

*** Identify retakers
* Attach the person identifier (id) and keep only applications matched in both files
merge 1:1 inscri2 using "Work Data/ids.dta"
keep if _merge == 3
drop _merge

* Number of applications per person
bysort id: egen exam_qt = count(inscri2)

tabulate exam_qt

* Order of each application within person, by year
sort id year
by id: generate exam_order = _n
tabulate exam_order

* Last year in which the person applied
by id: egen last_year = max(year)
tabulate exam_order if last_year == year

* Keep only the application from the last year of each person
keep if last_year == year
count
summarize female

***********************************************************************************************************************************************

** Merge School Data

count
sort inscri2
* Attach the school information; rows that exist only in the school file are discarded
merge 1:1 inscri2 using "Work data/schools_identified.dta", generate(merge_school)
drop if merge_school == 2
count

* For schools that are not identified in the data, we aggregate the different units of a given school:
* each set of codes below is listed and then replaced by a single code

local units inlist(co_entidade, 35130254, 35133450, 35103433, 35132214, 35133395, 35120339, 35143200, 35140351, 35140363, 35399197)
list nome_esc-uf_esc if `units'
replace co_entidade = 9935120339 if `units'

local units inlist(co_entidade, 35134788, 35107542)
list nome_esc-uf_esc if `units'
replace co_entidade = 9935107542 if `units'

local units inlist(co_entidade, 35114959, 35137273)
list nome_esc-uf_esc if `units'
replace co_entidade = 9935114959 if `units'

local units inlist(co_entidade, 35132925, 35115009)
list nome_esc-uf_esc if `units'
replace co_entidade = 9935132925 if `units'

local units inlist(co_entidade, 35142566, 35139683, 35154404)
list nome_esc-uf_esc if `units'
replace co_entidade = 9935139683 if `units'

local units inlist(co_entidade, 35802499, 35134387, 35143492)
list nome_esc-uf_esc if `units'
replace co_entidade = 9935134387 if `units'


***********************************************************************************************************************************************

* A missing convoc is set to 0
replace convoc = 0 if convoc == .

*****************************************************************************																		
* Normalizing scores (wide shape)							
*****************************************************************************

* Using Applicant ENEM Distribution for Non-Trainees
* ENEM z-score by year: (enem - yearly mean) / yearly standard deviation
tabulate trainingnew
bysort year: egen enem_bar_w = mean(enem)
bysort year: egen enem_sd_w = sd(enem)
generate norm_enem_w = (enem-enem_bar_w)/enem_sd_w
bysort year: summarize norm_enem_w norm_enem

*** Computing Normalized Final Scores NPO
* Final scores are normalized by year to z-scores (mean 0, standard deviation 1)

bysort year: summarize wNPOwo wNPO
foreach s in wNPOwo wNPO {
    bysort year: egen `s'_bar = mean(`s')
    bysort year: egen `s'_sd = sd(`s')
    generate norm_`s' = (`s'-`s'_bar)/`s'_sd
    bysort year: summarize norm_`s'
}

***********************************************************************************************************************************************

*****************************************************************************																		
* Item difficulty measure						
*****************************************************************************

* The simplest approach: for each P2 question, the yearly average score divided by the maximum score
* (5 in general, 4 in 2008).

foreach subj in port biol chem hist phy geog math lang {
    forvalues q = 1/12 {
        summarize `subj'`q'_st2
        bysort year: egen average`subj'`q'_st2 = mean(`subj'`q'_st2)
        generate item_difficulty`q'`subj' = cond(year == 2008, average`subj'`q'_st2/4, average`subj'`q'_st2/5)
        summarize item_diff*
    }
}

***********************************************************************************************************************************************

*****************************************************************************																		
* ENEM deciles			
*****************************************************************************

* Generating number to avoid the problem caused by "unique" which randomly divides ties
* A small uniform random number (below 0.001) is added to enem so that tied scores get distinct ranking values.
* The seed is reset and the data sorted by inscri2 first, so each applicant receives the same draw on every run.
set seed 1234
sort inscri2
gen double  r_number=runiform()/1000
gen double norm_ranking=enem+r_number
sum norm_ranking enem, detail

* Creating ENEM deciles
* NOTE(review): xtile() is not among the egen functions shipped with official Stata - presumably it comes
* from the user-written egenmore package; confirm it is installed before running.
so year (norm_ranking), stable
egen deciles=xtile(norm_ranking), nq(10) by(year)
tab deciles, sum(enem)
tab deciles year, sum(enem)

* Alternative deciles of (nearly) equal size within year, built from the position in the sorted data
so year (norm_ranking), stable
by year: gen deciles_equal = int(10*(_n-1)/_N)+1
tab deciles_equal, sum(enem)
tab deciles_equal year, sum(enem)

* Cross-check of the two decile definitions
tab deciles deciles_equal

*****************************************************************************																		
* Changing Data From Wide to Long							
*****************************************************************************

* Renaming scores before converting to long shape
* reshape long needs names of the form stub + subject, so the subject prefix is moved to the end of each name
foreach x in port biol chem hist phy geog math lang {
rename `x'_tot_st2 score`x'
rename `x'_zerostotal_st2 totalzeros`x'
rename `x'_zeros1to4_st2 zeros1to4`x'
rename `x'_zeros5to8_st2 zeros5to8`x'
rename `x'_zeros9to12_st2 zeros9to12`x'
rename `x'_stand_dev_st2 stand_dev`x'
rename `x'_maxtotal_st2 maxtotal`x'
rename `x'_max1to4_st2 max1to4`x'
rename `x'_max5to8_st2 max5to8`x'
rename `x'_max9to12_st2 max9to12`x'
}

* NOTE(review): no zerosP1* variable is created in the visible part of this file - presumably it comes from
* one of the merged datasets; confirm, otherwise the next two commands stop with an error.
sum score* totalzeros* zeros1to4* zeros5to8*  zeros9to12* stand_dev* maxtotal* max1to4* max5to8* max9to12* any_maxscore* any_zero* ///
totalmissing* missing1to4* missing5to8* missing9to12* coef_var* zerosP1* total_zeros_missing*

* Changing Data From Wide to Long
* One row per applicant (inscri2) and subject; the subject suffix of each stub is stored in the string variable subject.
* NOTE(review): some stubs are prefixes of other variable names (item_difficulty1 vs item_difficulty10-12,
* total_zeros_missing vs total_zeros_missing_1to4, score vs any other variable starting with score).
* With a string j this may create extra subject values - check with the tab subject below that exactly 8 appear.
reshape long score totalzeros zeros1to4 zeros5to8 zeros9to12 stand_dev maxtotal max1to4 max5to8 max9to12 total_zeros_missing total_zeros_missing_1to4 ///
total_zeros_missing_5to8 total_zeros_missing_9to12 totalmissing missing1to4 missing5to8 missing9to12 any_maxscore any_zero coef_var zerosP1 ///
item_difficulty1 item_difficulty2 item_difficulty3 item_difficulty4 item_difficulty5 item_difficulty6 ///
item_difficulty7 item_difficulty8 item_difficulty9 item_difficulty10 item_difficulty11 item_difficulty12  , i(inscri2) j(subject) string

*  one observation per subject in Phase 2 (8 lines per student)
tab subject
sum score totalzeros zeros1to4 zeros5to8 zeros9to12 coef_var stand_dev maxtotal max1to4 max5to8 max9to12 any_maxscore any_zero 
sum totalmissing missing1to4 missing5to8 missing9to12  
sum item_difficulty*
sum total_zeros_missing*

* Creating dummy variable for priority disciplines:
* on each row, priority takes the value of the prior_ dummy of that row's subject
generate priority = .
foreach subj in port biol chem hist phy geog math lang {
    replace priority = 1 if subject == "`subj'" & prior_`subj' == 1
    replace priority = 0 if subject == "`subj'" & prior_`subj' == 0
}
tabulate priority, missing
tabulate subject, summarize(priority)

*** Normalizing Phase 2 scores by subject and year (z-score: mean 0, standard deviation 1)
bysort year subject: summarize score
bysort year subject: egen mean_score = mean(score)
bysort year subject: egen sd_score = sd(score)
generate norm_score = (score-mean_score)/sd_score
bysort year subject: summarize norm_score

*** Normalizing Phase 2 scores by subject, year and gender
bysort year subject female: egen mean_score_g = mean(score)
bysort year subject female: egen sd_score_g = sd(score)
generate norm_score_g = (score-mean_score_g)/sd_score_g
bysort year subject female: summarize norm_score_g
tabulate subject, summarize(norm_score_g)

*** Creating new variable to summarize columns of phase 1 scores

/* Each P1 score is a column in the data set. We create a new variable that summarizes the P1 scores:
it takes the value of the P1 score of the same subject that the row holds for Phase 2.

In Phase 1 the following subjects are not included in general questions: Portuguese and Foreign Language.
For Portuguese, we can use the essay score, but for foreign language this variable is missing.

*/

generate p1score = essay_st1 if subject == "port"
foreach subj in biol chem hist phy geog math {
    replace p1score = tot_`subj'_p1 if subject == "`subj'"
}
tabulate subject, summarize(p1score)

*** Normalizing Phase 1 scores by subject and year
* The essay has a different total score than the other subjects, so the scores are normalized
bysort year subject: egen mean_p1score = mean(p1score)
bysort year subject: egen sd_p1score = sd(p1score)
generate norm_p1score = (p1score-mean_p1score)/sd_p1score
bysort year subject: summarize norm_p1score
tabulate subject, summarize(norm_p1score)

*** Normalizing Phase 1 scores by subject, year and gender
bysort year subject female: egen mean_p1score_g = mean(p1score)
bysort year subject female: egen sd_p1score_g = sd(p1score)
generate norm_p1score_g = (p1score-mean_p1score_g)/sd_p1score_g
bysort year subject female: summarize norm_p1score_g
tabulate subject, summarize(norm_p1score_g)

* Store the final long-shape dataset
save "Work Data/Gender_Phase2_long.dta", replace







